Loading libraries

library(sf)
library(dplyr)
library(tidyr)
library(janitor)
library(ggplot2)
library(viridis)
library(leaflet)
provinces_full <- readRDS("../data_clean/provinces_full.rds")

1. basic summary of the variables

prov_df <- prov %>% st_drop_geometry()

summary_stats <- prov_df %>%
  summarise(
    mort_mean = mean(diabetes_mort_rate, na.rm = TRUE),
    mort_sd   = sd(diabetes_mort_rate, na.rm = TRUE),
    mort_min  = min(diabetes_mort_rate, na.rm = TRUE),
    mort_max  = max(diabetes_mort_rate, na.rm = TRUE),
    
    age_mean  = mean(pct_65plus, na.rm = TRUE),
    unemp_mean = mean(unemployment_rate, na.rm = TRUE),
    edu_mean  = mean(low_education_share, na.rm = TRUE),
    nutr_mean = mean(adequate_nutrition, na.rm = TRUE),
    sed_mean  = mean(sedentariness, na.rm = TRUE)
  )

summary_stats
##   mort_mean  mort_sd mort_min mort_max age_mean unemp_mean edu_mean nutr_mean sed_mean
## 1  4.118037 1.440078     1.85     9.08 25.00153   6.677508 48.26879  16.89252 33.74673

3. Which provinces are most affected? Which are least?

## 3.1 Diabetes mortality rate ####
# most affective:
prov_df %>%
  arrange(desc(diabetes_mort_rate)) %>%
  select(prov_name, diabetes_mort_rate) %>%
  slice(1:10)
##             prov_name diabetes_mort_rate
## 1           Agrigento               9.08
## 2                Enna               7.71
## 3  Reggio di Calabria               7.35
## 4             Isernia               7.29
## 5             Crotone               7.15
## 6             Cosenza               6.78
## 7             Trapani               6.35
## 8           Catanzaro               6.33
## 9       Caltanissetta               6.32
## 10            Potenza               6.12
# least effective:
prov_df %>%
  arrange(diabetes_mort_rate) %>%
  select(prov_name, diabetes_mort_rate) %>%
  slice(1:10)
##                prov_name diabetes_mort_rate
## 1                Bolzano               1.85
## 2                  Lecco               2.01
## 3                 Trento               2.11
## 4                  Aosta               2.20
## 5                Sondrio               2.24
## 6  Monza e della Brianza               2.27
## 7                Brescia               2.33
## 8                  Prato               2.43
## 9     Reggio nell'Emilia               2.46
## 10               Bergamo               2.48
# we can already see that provinces located in the south have greater diabetes mortality rate.

## 3.2 Ageing ####
# most affective:
prov_df %>%
  arrange(desc(pct_65plus)) %>%
  select(prov_name, pct_65plus) %>%
  slice(1:10)
##      prov_name pct_65plus
## 1       Biella    29.8925
## 2       Savona    29.7446
## 3       Genova    29.0450
## 4     Oristano    28.8914
## 5     Grosseto    28.7334
## 6      Ferrara    28.7055
## 7      Trieste    28.6880
## 8        Terni    28.5791
## 9  Alessandria    28.4154
## 10     Imperia    28.3253
# least effective:
prov_df %>%
  arrange(pct_65plus) %>%
  select(prov_name, pct_65plus) %>%
  slice(1:10)
##                prov_name pct_65plus
## 1                Caserta    18.9922
## 2                 Napoli    19.5782
## 3                Bolzano    20.2766
## 4  Barletta-Andria-Trani    21.1191
## 5                 Ragusa    21.2972
## 6                Catania    21.5232
## 7                Crotone    22.0505
## 8                Bergamo    22.1082
## 9                Salerno    22.3033
## 10                  Lodi    22.3805
## 3.3 Unemployment ####

# most affective:
prov_df %>%
  arrange(desc(unemployment_rate)) %>%
  select(prov_name, unemployment_rate) %>%
  slice(1:10)
##        prov_name unemployment_rate
## 1         Napoli          20.36696
## 2        Messina          16.44512
## 3         Foggia          16.34486
## 4      Agrigento          16.20707
## 5      Catanzaro          15.93901
## 6       Siracusa          15.46265
## 7        Palermo          14.66749
## 8        Cosenza          14.11287
## 9  Caltanissetta          13.99437
## 10 Vibo Valentia          13.60034
# least effective:
prov_df %>%
  arrange(unemployment_rate) %>%
  select(prov_name, unemployment_rate) %>%
  slice(1:10)
##    prov_name unemployment_rate
## 1    Bergamo          1.545735
## 2  Pordenone          1.782660
## 3    Cremona          1.900585
## 4    Bolzano          1.990011
## 5    Treviso          2.327501
## 6      Prato          2.553198
## 7     Padova          2.558400
## 8       Lodi          2.582747
## 9     Verona          2.589108
## 10   Firenze          2.688453
# same thing: unemplyment rate is clearly greater in the south

## 3.4 education ####
edu_region <- prov_df %>%
  distinct(region_name, low_education_share)
# Most affected (highest low education share)
edu_region %>%
  arrange(desc(low_education_share)) %>%
  slice(1:10)
##      region_name low_education_share
## 1         Puglia            56.62414
## 2       Sardegna            55.62087
## 3        Sicilia            55.44252
## 4       Calabria            53.77098
## 5       Campania            52.95224
## 6  Valle d'Aosta            49.14710
## 7         Molise            48.90983
## 8       Piemonte            48.33849
## 9     Basilicata            47.95307
## 10       Toscana            47.44715
# Least affected (lowest low education share)
edu_region %>%
  arrange(low_education_share) %>%
  slice(1:10)
##              region_name low_education_share
## 1                  Lazio            40.12169
## 2                 Umbria            42.54312
## 3  Friuli-Venezia Giulia            42.82400
## 4                Liguria            43.59771
## 5    Trentino-Alto Adige            43.92755
## 6         Emilia-Romagna            44.06597
## 7                Abruzzo            44.96981
## 8              Lombardia            45.81564
## 9                 Veneto            47.02004
## 10                Marche            47.12458
## 3.5 Nutrition and sedentariness ####
nut_region <- prov_df %>%
  distinct(region_name, adequate_nutrition)

# Best (highest adequate nutrition)
nut_region %>%
  arrange(desc(adequate_nutrition)) %>%
  slice(1:10)
##              region_name adequate_nutrition
## 1               Piemonte               25.4
## 2                 Marche               23.0
## 3         Emilia-Romagna               21.9
## 4  Friuli-Venezia Giulia               19.4
## 5          Valle d'Aosta               18.9
## 6                Liguria               18.8
## 7                Toscana               18.8
## 8              Lombardia               18.1
## 9                  Lazio               17.9
## 10   Trentino-Alto Adige               17.6
# Worst (lowest adequate nutrition)
nut_region %>%
  arrange(adequate_nutrition) %>%
  slice(1:10)
##    region_name adequate_nutrition
## 1   Basilicata                7.1
## 2     Campania                9.9
## 3      Sicilia               10.1
## 4       Puglia               11.1
## 5       Molise               11.6
## 6     Calabria               12.9
## 7      Abruzzo               14.4
## 8       Veneto               14.6
## 9       Umbria               17.2
## 10    Sardegna               17.5
# the same here, the south has the lowest adequate nutrition

sed_region <- prov_df %>%
  distinct(region_name, sedentariness)

# Best (highest sedentariness)
sed_region %>%
  arrange(desc(sedentariness)) %>%
  slice(1:10)
##    region_name sedentariness
## 1   Basilicata          53.7
## 2     Campania          53.1
## 3      Sicilia          52.5
## 4       Puglia          48.6
## 5     Calabria          48.2
## 6       Molise          38.9
## 7     Sardegna          34.8
## 8        Lazio          32.0
## 9      Abruzzo          31.5
## 10      Umbria          30.5
# Worst (lowest sedentariness)
sed_region %>%
  arrange(sedentariness) %>%
  slice(1:10)
##              region_name sedentariness
## 1    Trentino-Alto Adige          13.8
## 2  Friuli-Venezia Giulia          22.6
## 3                 Veneto          23.1
## 4              Lombardia          25.5
## 5         Emilia-Romagna          26.2
## 6          Valle d'Aosta          26.4
## 7                 Marche          28.8
## 8               Piemonte          29.1
## 9                Toscana          29.1
## 10               Liguria          29.6
# the same 

4. cross table analysis with mortality

# Helper to create quantile-based groups with nice labels
qcut <- function(x, probs, labels) {
  cut(x,
      breaks = quantile(x, probs = probs, na.rm = TRUE),
      include.lowest = TRUE,
      labels = labels)
}

prov_cat <- prov_df %>%
  mutate(
    # Outcome: mortality quartiles
    mort_q = qcut(diabetes_mort_rate,
                  probs  = c(0, .25, .50, .75, 1),
                  labels = c("Q1 (lowest)", "Q2", "Q3", "Q4 (highest)")),
    
    # Predictors: tertiles (Low/Medium/High)
    age_ter = qcut(pct_65plus,
                   probs  = c(0, 1/3, 2/3, 1),
                   labels = c("Low", "Medium", "High")),
    
    unemp_ter = qcut(unemployment_rate,
                     probs  = c(0, 1/3, 2/3, 1),
                     labels = c("Low", "Medium", "High")),
    
    edu_ter = qcut(low_education_share,
                   probs  = c(0, 1/3, 2/3, 1),
                   labels = c("Low", "Medium", "High")),
    
    nutr_ter = qcut(adequate_nutrition,
                    probs  = c(0, 1/3, 2/3, 1),
                    labels = c("Low", "Medium", "High")),
    
    sed_ter = qcut(sedentariness,
                   probs  = c(0, 1/3, 2/3, 1),
                   labels = c("Low", "Medium", "High"))
  )


crosstab_mort <- function(data, var) {
  data %>%
    count(mort_q, {{ var }}) %>%
    group_by(mort_q) %>%
    mutate(pct_within_mort = n / sum(n) * 100) %>%
    ungroup() %>%
    arrange(mort_q, {{ var }})
}

4.1 Ageing

tab_age  <- crosstab_mort(prov_cat, age_ter)
tab_age
## # A tibble: 12 × 4
##    mort_q       age_ter     n pct_within_mort
##    <fct>        <fct>   <int>           <dbl>
##  1 Q1 (lowest)  Low        15           55.6 
##  2 Q1 (lowest)  Medium     10           37.0 
##  3 Q1 (lowest)  High        2            7.41
##  4 Q2           Low         5           17.9 
##  5 Q2           Medium     10           35.7 
##  6 Q2           High       13           46.4 
##  7 Q3           Low         3           12   
##  8 Q3           Medium      5           20   
##  9 Q3           High       17           68   
## 10 Q4 (highest) Low        13           48.1 
## 11 Q4 (highest) Medium     10           37.0 
## 12 Q4 (highest) High        4           14.8

Here the question is: “within each diabetes mortality quartile, how are provinces distributed across age structure (Low / Medium / High %65+)?”

  • Q1: among provinces with the lowest diabetes mortality, more than half have a low share of elderly, and very few have a high share of elderly.

  • Q2: in slightly higher mortality provinces, the majority already have medium-to-high ageing.

  • Q3: for provinces in the third mortality quartile, over two thirds belong to the highest ageing group

  • Q4: descending

Provinces with very high diabetes mortality are not dominated by highly aged populations. Ageing explains diabetes mortality up to a point, but cannot explain the highest mortality levels alone.

4.2 Unemployment

tab_unem <- crosstab_mort(prov_cat, unemp_ter)
tab_unem
## # A tibble: 11 × 4
##    mort_q       unemp_ter     n pct_within_mort
##    <fct>        <fct>     <int>           <dbl>
##  1 Q1 (lowest)  Low          19           70.4 
##  2 Q1 (lowest)  Medium        6           22.2 
##  3 Q1 (lowest)  High          2            7.41
##  4 Q2           Low          12           42.9 
##  5 Q2           Medium       12           42.9 
##  6 Q2           High          4           14.3 
##  7 Q3           Low           5           20   
##  8 Q3           Medium       12           48   
##  9 Q3           High          8           32   
## 10 Q4 (highest) Medium        5           18.5 
## 11 Q4 (highest) High         22           81.5
  • Q1: 70.4% Low unemployment: Very few high-unemployment provinces

  • Q2: Balanced Low / Medium unemployment, Still few high-unemployment provinces

  • Q3: 32% High unemployment, Nearly half Medium, Unemployment starts to matter more

  • Q4: 81.5% High unemployment, Almost no low-unemployment provinces

Provinces with the highest diabetes mortality are overwhelmingly concentrated in the highest unemployment tertile, suggesting a strong socio-economic gradient that is not explained by age alone.

4.3 Education

tab_edu  <- crosstab_mort(prov_cat, edu_ter)
tab_edu
## # A tibble: 12 × 4
##    mort_q       edu_ter     n pct_within_mort
##    <fct>        <fct>   <int>           <dbl>
##  1 Q1 (lowest)  Low        18           66.7 
##  2 Q1 (lowest)  Medium      7           25.9 
##  3 Q1 (lowest)  High        2            7.41
##  4 Q2           Low        11           39.3 
##  5 Q2           Medium     15           53.6 
##  6 Q2           High        2            7.14
##  7 Q3           Low        10           40   
##  8 Q3           Medium      8           32   
##  9 Q3           High        7           28   
## 10 Q4 (highest) Low         3           11.1 
## 11 Q4 (highest) Medium      2            7.41
## 12 Q4 (highest) High       22           81.5
  • Q1: 66.7% Low low-education share, very few High: better education = lower mortality

  • Q2, Q3: Increasing presence of Medium & High, gradient is visible but smoother

  • Q4: 81.5% High low-education share –> Strong structural association

The highest mortality quartile is strongly associated with regions exhibiting a high proportion of low-educated population, highlighting the role of long-term structural socio-economic disadvantages.

4.4 Nutrition

tab_nutr <- crosstab_mort(prov_cat, nutr_ter)
tab_nutr
## # A tibble: 11 × 4
##    mort_q       nutr_ter     n pct_within_mort
##    <fct>        <fct>    <int>           <dbl>
##  1 Q1 (lowest)  Low          3           11.1 
##  2 Q1 (lowest)  Medium      14           51.9 
##  3 Q1 (lowest)  High        10           37.0 
##  4 Q2           Low          4           14.3 
##  5 Q2           Medium      12           42.9 
##  6 Q2           High        12           42.9 
##  7 Q3           Low          8           32   
##  8 Q3           Medium      12           48   
##  9 Q3           High         5           20   
## 10 Q4 (highest) Low         25           92.6 
## 11 Q4 (highest) Medium       2            7.41
  • Q1: Mostly Medium–High nutrition, Only 11% Low nutrition –> nutrition has a protective effect

  • Q2, Q3: Mixed distribution

  • Q4: 92.6% Low adequate nutrition

Provinces with the highest diabetes mortality are almost exclusively located in regions characterized by low levels of adequate nutrition, reinforcing the role of diet-related factors in diabetes-related mortality.

4.5 Sedentariness

tab_sed  <- crosstab_mort(prov_cat, sed_ter)
tab_sed
## # A tibble: 11 × 4
##    mort_q       sed_ter     n pct_within_mort
##    <fct>        <fct>   <int>           <dbl>
##  1 Q1 (lowest)  Low        25           92.6 
##  2 Q1 (lowest)  Medium      1            3.70
##  3 Q1 (lowest)  High        1            3.70
##  4 Q2           Low        10           35.7 
##  5 Q2           Medium     16           57.1 
##  6 Q2           High        2            7.14
##  7 Q3           Low         5           20   
##  8 Q3           Medium     13           52   
##  9 Q3           High        7           28   
## 10 Q4 (highest) Medium      3           11.1 
## 11 Q4 (highest) High       24           88.9
  • Q1: 92.6% Low sedentariness, Low mortality provinces are almost entirely in low-sedentary regions

  • Q2, Q3: Mostly Medium sedentariness, Some High in Q3 –> Clear gradient emerging

  • Q4: 88.9% High sedentariness –> Extremely strong concentration

Provinces in the highest mortality quartile are almost entirely located in regions with high levels of sedentary behavior, indicating that lifestyle-related risk factors play a key role in explaining extreme mortality outcomes.

5. Mapping

5.1 Static Maps

Diabetes Mortality Rate

### 5.1.1 diabetes mortality rate ####
ggplot(provinces_full) +
  geom_sf(aes(fill = diabetes_mort_rate), color = "grey30", size = 0.1) +
  scale_fill_viridis(
    name = "Diabetes mortality rate",
    option = "plasma",
    direction = -1
  ) +
  labs(
    title = "Diabetes mortality rate by province",
    subtitle = "Quantile classification",
    caption = "Source: ISTAT"
  ) +
  theme_minimal()

ggsave("../outputs/diabetes_mortality_rate.png", width = 6, height = 4, dpi = 300)

Ageing

### 5.1.2 Ageing ####
ggplot(provinces_full) +
  geom_sf(aes(fill = pct_65plus), color = "grey30", size = 0.1) +
  scale_fill_viridis(
    name = "65+ (%)",
    option = "magma",
  ) +
  labs(
    title = "Percentage of 65+ by province"
  ) +
  theme_minimal()

ggsave("../outputs/pct_65+.png", width = 6, height = 4, dpi = 300)

Unemployment Rate

### 5.1.3 Unemplyment rate ####
ggplot(provinces_full) +
  geom_sf(aes(fill = unemployment_rate), color = "grey30", size = 0.1) +
  scale_fill_viridis(
    name = "Unemployment rate (%)",
    option = "magma",
    direction = -1
  ) +
  labs(
    title = "Unemployment rate by province"
  ) +
  theme_minimal()

ggsave("../outputs/unemployment_rate.png", width = 6, height = 4, dpi = 300)

Education

### 5.1.4 Low Education Share ####
ggplot(provinces_full) +
  geom_sf(aes(fill = low_education_share), color = "grey30", size = 0.1) +
  scale_fill_viridis(
    name = "Low Education Share (%)",
    option = "mako",
    direction = -1
  ) +
  labs(
    title = "Low Education Share (regional indicator)",
    subtitle = "Same value for provinces within the same region"
  ) +
  theme_minimal()

ggsave("../outputs/low_education_share.png", width = 6, height = 4, dpi = 300)

Adequate Nutrition

### 5.1.4 Adequate Nutrition ####
ggplot(provinces_full) +
  geom_sf(aes(fill = adequate_nutrition), color = "grey30", size = 0.1) +
  scale_fill_viridis(
    name = "Adequate nutrition (%)",
    option = "viridis",
  ) +
  labs(
    title = "Adequate nutrition (regional indicator)",
    subtitle = "Same value for provinces within the same region"
  ) +
  theme_minimal()

ggsave("../outputs/adequate_nutrition.png", width = 6, height = 4, dpi = 300)

Sedentariness

### 5.1.5 Sedentariness ####
ggplot(provinces_full) +
  geom_sf(aes(fill = sedentariness), color = "grey30", size = 0.1) +
  scale_fill_viridis(
    name = "Sedentary population (%)",
    option = "inferno",
    direction = -1
  ) +
  labs(
    title = "Sedentariness (regional indicator)",
    subtitle = "Same value for provinces within the same region"
  ) +
  theme_minimal()

ggsave("../outputs/sedentariness.png", width = 6, height = 4, dpi = 300)

5.2 Interactive Maps

st_crs(provinces_full)
## Coordinate Reference System:
##   User input: WGS 84 / UTM zone 32N 
##   wkt:
## PROJCRS["WGS 84 / UTM zone 32N",
##     BASEGEOGCRS["WGS 84",
##         DATUM["World Geodetic System 1984",
##             ELLIPSOID["WGS 84",6378137,298.257223563,
##                 LENGTHUNIT["metre",1]]],
##         PRIMEM["Greenwich",0,
##             ANGLEUNIT["degree",0.0174532925199433]],
##         ID["EPSG",4326]],
##     CONVERSION["UTM zone 32N",
##         METHOD["Transverse Mercator",
##             ID["EPSG",9807]],
##         PARAMETER["Latitude of natural origin",0,
##             ANGLEUNIT["Degree",0.0174532925199433],
##             ID["EPSG",8801]],
##         PARAMETER["Longitude of natural origin",9,
##             ANGLEUNIT["Degree",0.0174532925199433],
##             ID["EPSG",8802]],
##         PARAMETER["Scale factor at natural origin",0.9996,
##             SCALEUNIT["unity",1],
##             ID["EPSG",8805]],
##         PARAMETER["False easting",500000,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8806]],
##         PARAMETER["False northing",0,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8807]]],
##     CS[Cartesian,2],
##         AXIS["(E)",east,
##             ORDER[1],
##             LENGTHUNIT["metre",1]],
##         AXIS["(N)",north,
##             ORDER[2],
##             LENGTHUNIT["metre",1]],
##     ID["EPSG",32632]]
provinces_leaflet <- st_transform(provinces_full, 4326)

mort_breaks <- quantile(
  provinces_leaflet$diabetes_mort_rate,
  probs = seq(0, 1, 0.2),
  na.rm = TRUE
)

pal_mort <- colorBin(
  palette = "YlOrRd",
  domain  = provinces_leaflet$diabetes_mort_rate,
  bins    = mort_breaks
)

# interactive map 
leaflet(provinces_leaflet) %>%
  addProviderTiles("CartoDB.Positron") %>%
  addPolygons(
    fillColor = ~pal_mort(diabetes_mort_rate),
    weight = 0.4,
    fillOpacity = 0.8,
    
    label = ~paste0(prov_name, " – ", region_name),
    
    popup = ~paste0(
      "<b>", prov_name, "</b><br>",
      "<i>", region_name, "</i><br><br>",
      
      "<b>Diabetes mortality:</b> ",
      round(diabetes_mort_rate, 1), "<br><br>",
      
      "<b>Population 65+:</b> ",
      round(pct_65plus, 1), "%<br>",
      
      "<b>Unemployment:</b> ",
      round(unemployment_rate, 1), "%<br><br>",
      
      "<b>Sedentariness (region):</b> ",
      sedentariness, "%<br>",
      
      "<b>Adequate nutrition (region):</b> ",
      adequate_nutrition, "%<br>",
      
      "<b>Low education share (region):</b> ",
      round(low_education_share, 1), "%"
    )
  ) %>%
  addLegend(
    pal = pal_mort,
    values = ~diabetes_mort_rate,
    title = "Diabetes mortality rate<br>(per 10,000)",
    labFormat = labelFormat(digits = 1),
    opacity = 0.9,
    position = "bottomright"
  )